########### Algorithmic representation: Study 1 VPN audit ######
# Code for the replication of the analyses reported in the paper:
# Finding the white male: The prevalence and consequences of 
#algorithmic gender and race bias in political Google searches

# doi: XXXXXXXXXXXXXX

# Contact corresponding author for help/information with code problems


#  OVERVIEW OF CODE SECTIONS----------------------------------------------------

# 0: Data preparation
# H1: Tests of baseline bias hypothesis
# H2: Tests of distribution bias hypothesis



#if using r studio, display content outline with Ctrl + Shift + O


# 0 - Data preparation ----------------------
#libraries
library(tidyverse) # for tidy data manipulations
library(lme4) # For multilevel models
library(lmerTest) # for multilevel models
library(marginaleffects) # extracting results from multilevel models
library(sjPlot) # exporting regression results
library(sjmisc) #exporting regresion resuls
library(psych) # for corr.test()
library(ggrepel) #for visualizations 
library(ggplot2)
library(ggpmisc)
library(gghighlight)
library(countrycode)
library(cowplot)
library(lemon)

#enter local path to the folder containing the replication files
path_to_folder <- "" 
setwd(path_to_folder) #set working directory to that folder

data <- read.csv("data/vpn_data_study1.csv", sep = ";") #data from first vpn audit
country_data <- read.csv("data/country_data_vpn.csv", sep = ";") #external data about countries and their legislative bodies (from interpariamentary-union)

# the following lines perform basic transformations to prepare the data
data <- data %>%
  mutate( 
    #note that the variable "main_language" was not used in the analyses;
    #it needs to be calculated to filter out duplicate queries
    main_language = case_when(
      country == "Belgium" & language != "Dutch" ~ "minority language",
      country == "Canada" & language != "English" ~ "minority language",
      country == "India" & language != "Hindi" ~ "minority language",
      country == "Kazakhstan" & language != "Kazakh" ~ "minority language",
      country == "Lebanon" & language != "Arabic" ~ "minority language",
      country == "Luxembourg" & language != "Luxembourgish" ~ "minority language",
      country == "Malta" & language != "Maltese" ~ "minority language",
      country == "Spain" & language != "Spanish" ~ "minority language",
      country == "Switzerland" & language != "German" ~ "minority language",
      country == "United States" & language != "English" ~ "minority language",
      TRUE ~ "majority language"),
    gender_ratio = (number_female - number_male) / (number_female + number_male),
   #dummy variable indicating whether an image contains at least one woman
     presence_female = case_when(
      number_female >= 1 ~ 1,
      number_female < 1 ~ 0),
   #dummy variable indicating whether an image contains at least one man
    presence_male = case_when(
      number_male >= 1 ~ 1,
      number_male < 1 ~ 0),
   #main DV as the share of women given the total number of persons depicted in an image 
   share_female = number_female / number_of_persons,
     gender_ratio = (number_female - number_male) / (number_female + number_male) #see pre-registration; only used in appendix Table A1.3
  )

#prepare external country data
country_info <- country_data %>%
  #pivot into long format to match audit data
  pivot_longer(
    cols = c(lower_share, upper_share),
    names_to = c("chamber","share"),
    names_sep = "_",
    values_to = "representation")%>%
  #add an id that can be used to match while merging datasets below
  mutate(
    search_id = paste0(country,"_","_",language,"_",chamber))%>%
 #only select relevant variables
   dplyr::select(representation, search_id, lower_seats,	lower_women,
         gggi_score, gggi_rank, region, quota, quota_type,
         upper_seats,	upper_women,	global_rank, main_language)


#merge data in 2 steps
data <-data %>%
  #generate an equivalent search_id for merging in audit data
  group_by(country, chamber, language) %>%
  mutate(
    search_id = paste0(country,"_","_",language,"_",chamber)) %>%
  dplyr::select(-main_language) %>%
  #join data sets by the common search_id
  left_join(., country_info, by = "search_id") %>%
  ungroup()

#create an aggregated data at country-level (used for country-level correlations)
  data_aggregated <-data %>%
  group_by(country, chamber, language) %>%
  summarise(
    number_persons = mean(number_of_persons, na.rm = TRUE),
    number_women = mean(number_female, na.rm = TRUE),
    number_men = mean(number_male, na.rm = TRUE),
    presence_women = mean(presence_female, na.rm = TRUE),
    presence_men = mean(presence_male, na.rm = TRUE),
    share_women = mean(share_female, na.rm = TRUE),
    gender_ratio = mean(gender_ratio, na.rm = TRUE) 
  ) %>%
  mutate(
  search_id = paste0(country,"_","_",language,"_",chamber))%>%
  left_join(., country_info, by = "search_id")


#finally, we create per chamber data subsets to run separate analyses
#note that we filter out queries where the used language does
# not correspond to the main language, e.g., french in Canada or Spanish in the USA

d_lower <- data %>%
  filter(main_language == "majority language") %>%
  filter(chamber == "lower") 

d_upper <- data %>%
  filter(main_language == "majority language") %>%
  filter(chamber == "upper") 




# H1 - Baseline bias ----------------------
#below we calculatate the mixed effects models with random
# intercepts at the country levels. 
# We estimate women's representation in search engine output
# as the fixed effect intercept for different measures

## number female -------------
number_female_lower <- lmer(data = d_lower,
                  number_female ~ 1 
                  + (1 | country) )

number_female_upper <- lmer(data = d_upper,
                            number_female ~ 1 
                            + (1 | country) )

## number male -------------
number_male_lower <- lmer(data = d_lower,
                            number_male ~ 1 
                            + (1 | country) )
number_male_upper <- lmer(data = d_upper,
                            number_male ~ 1 
                            + (1 | country) )


## presence female -------------
presence_female_lower <- glmer(data = d_lower,
                         presence_female ~ 1 +
                         (1 | country),
                         family = "binomial")

presence_female_upper <- glmer(data = d_upper,
                               presence_female ~ 1 +
                                 (1 | country),
                               family = "binomial")

## presence male -------------
presence_male_lower <- glmer(data = d_lower,
                               presence_male ~ 1 +
                                 (1 | country),
                               family = "binomial")

presence_male_upper <- glmer(data = d_upper,
                               presence_male ~ 1 +
                                 (1 | country),
                               family = "binomial")

## gender ratio -------------

ratio_female_lower <- lmer(data = d_lower,
                      gender_ratio ~ 1
                      + (1 | country))
ratio_female_upper <- lmer(data = d_upper,
                           gender_ratio ~ 1
                           + (1 | country))

## female share  -------------
# these are the models reported in the text of the manuscript
share_female_lower <- lmer(data = d_lower,
                           share_female ~ 1
                           + (1 | country))
share_female_upper <- lmer(data = d_upper,
                           share_female ~ 1
                           + (1 | country))

# The functions below create output tables that are in the appendix;
# They also show the results for Table 1 (study 1) in the main manuscript

#The code below produces an output of the upper part of Table C1.1a in the appendix;
tab_model(share_female_lower, presence_female_lower,
          presence_male_lower, number_female_lower, number_male_lower,
          file = "outputs/Study1_TableC1.1a_LowerChamber.doc")

#The code below produces an output of the lower part of Table C1.1a in the appendix;
tab_model(share_female_upper, presence_female_upper,
          presence_male_upper, number_female_upper, number_male_upper,
          file = "outputs/Study1_TableC1.1a_UpperChamber.doc")


#The code below produces an output of Table C1.3 in the appendix
tab_model(ratio_female_lower, ratio_female_upper,
          file = "outputs/Study1_TableC1.3_AlternativeOperationalization.doc")


# H2 - Distribution bias ----------------------
# Below we test the distribution bias hypothesis in two ways
# First run correlations on country-level data sets
# Second, we repeat the same models as above but with representation
# as a predictor (along with control variables)

## country-level correlations -------------------- 

#upper chamber
data_aggregated %>%
  filter(chamber == "upper") %>%
  ungroup() %>%
  dplyr::select(share_women, representation) %>%
  drop_na()%>%
  psych::corr.test()%>%
  print(short = FALSE)


# lower chamber
data_aggregated %>%
  filter(chamber == "lower") %>%
  ungroup() %>%
  dplyr::select(share_women, representation) %>%
  drop_na()%>%
  psych::corr.test()%>%
  print(short = FALSE)

## mixed effect models ---------------------


share_combined_additional <- lmer(data = data,
                           share_female*100 ~ 
                             representation + chamber +
                             main_language +
                             gggi_score + quota  +
                              (1 | country) )

number_combined_additional <- lmer(data = data,
                                  number_female ~ 
                                    representation + chamber +
                                    main_language +
                                    gggi_score + quota  +
                                    (1 | country) )

presence_combined_additional <- glmer(data = data,
                                  presence_female ~ 
                                    representation + chamber +
                                    main_language +
                                    gggi_score + quota  +
                                    (1 | country),
                                  family = "binomial")

tab_model(share_combined_additional, number_combined_additional,
          presence_combined_additional,
          file = "outputs/Study1_TableC1.2_AdditionalModelWithControls.doc")


# visualization: Figure 2 --------------------------------------

# Panel A of Figure 1 in the main manuscript: correlations

country_plot <- data_aggregated%>%
  filter(main_language == "majority language")%>%
  filter(country != "Saudi Arabia") %>%
  ggplot(.,
         aes(y=share_women, x = representation/100, label = country)) +
  geom_abline(slope = 1, intercept = 0, color = "red", alpha = .4,
              linetype = "dashed")+
  geom_point(alpha = .85, color = "#3b528b")+
  geom_smooth(method = "lm", color = "#3b528b", fill = "#3b528b")+
  geom_text_repel(size = 2.5,
                  segment.color = 'grey50')+
  stat_poly_eq(use_label(c("R2", "p","n"))) +
  facet_wrap(~chamber)+
  theme_classic()+
  coord_flex_cart(bottom= capped_horisontal(), left = capped_vertical()) +
  labs(x = "Share of women in parliament", y = "Share of women in Google image searches")+
  scale_y_continuous(labels = scales::percent_format(),
                     breaks = seq(0,1,.1),
                     limits = c(0,.6)
  ) +
  scale_x_continuous(labels = scales::percent_format(),
                     breaks = seq(0,1,.1), 
                     limits = c(0,.6)
  ) +
  theme(
    strip.text.x = element_text(
      size = 12, face = "bold"),
    strip.background = element_blank())



# Panel B: Rep - diff plot
#prepare data
diff_data <-data %>%
  filter(main_language == "majority language") %>%
  mutate(diff = (share_female*100) - representation) %>%
  group_by(country, chamber) %>%
  summarise(
    mean_diff = mean(diff, na.rm = TRUE),
    sd_diff = sd(diff, na.rm = TRUE),
    se_diff = sd_diff / sqrt(n()), # Standard error calculation
    lower_ci = mean_diff - qt(0.975, n() - 1) * se_diff, # Lower CI
    upper_ci = mean_diff + qt(0.975, n() - 1) * se_diff) %>%
  mutate(
    iso2c = countrycode(country, origin = 'country.name', destination = 'iso2c'),
    bias = case_when(
      lower_ci >0 ~ "overestimation",
      upper_ci < 0 ~ "underestimation",
      lower_ci < 0 | upper_ci > 0 ~ "no diff"))





diff_plot <- diff_data %>%
  ggplot(
    aes(
      x = reorder(iso2c,mean_diff), shape = chamber, group = chamber,
      y = mean_diff, ymin = lower_ci, ymax = upper_ci,
      color = bias)) +
  geom_hline(yintercept = 0, linetype = "dashed")+
  geom_pointrange(
    position = position_dodge(.35)) +
  theme_classic()+
  coord_flex_cart(bottom= capped_horisontal(), left = capped_vertical()) +
  labs(x = "",
       shape = "",
       y = expression(Delta~"Representation"))+
  scale_y_continuous(labels = scales::percent_format(scale = 1),
                     breaks = seq(-40,40,10),
                     limits = c(-40,40)) +
  scale_color_manual(values = c("grey73","#35b779","#440154"))+
  theme(
    legend.position = c(.9,.25),
    strip.text.x = element_text(
      size = 12, face = "bold"),
    strip.background = element_blank())+
  guides(color = "none")



Figure1 <- cowplot::plot_grid(country_plot, diff_plot, labels = "AUTO",
                         nrow = 2,
                         rel_heights = c(2, 1))


ggsave(Figure1, filename = "outputs/Study1_Figure2.jpeg",
       width = 9, height = 7, dpi = 800)
